********************************************************************************
* matching_worker.do
* Purpose: Match M&A workers with comparable non-M&A control workers using
*          propensity score matching (PSM), then construct the matched
*          worker event-study panel.
*
* Matching design:
*   - Workers are embedded in matched firm pairs (from matching_firm.do).
*     Treated workers work at M&A firms; control workers work at matched controls.
*   - Matching year window: 2005-2016 (one year later than firm matching
*     because the worker requires firm_matched_list from the prior year)
*   - Eligibility restrictions:
*       * Total annual earnings >= ~CAD 3,900 real (2011 dollars)
*       * Not a moonlighter (no simultaneous jobs in any year)
*       * No gaps in employment (max_gap == 1, continuous tenure)
*       * Present at the matched firm in t=0 (last_year_at_firm >= y+1)
*       * At least 4 years of tenure at matching year
*       * Non-missing age, sex, sector, province
*   - Matching variable: worker age (quadratic)
*   - Cells: NAICS2 x province x sex x 5-year age bin
*   - PSM: psmatch2, caliper 1, 1:1 nearest-neighbor without replacement
*
* The matched worker panel spans 2001-2017. Event time t is measured relative to
* the event year and binned at the ends (t < -5 is coded -6, t > 5 is coded +6),
* so t runs from -6 to +6. ds_6 (t = -1) is set to zero as the base period.
*
* Inputs : $data/worker_YYYY.dta, $data/firm_matched_list.dta
* Outputs:
*   $data/worker_matched_list.dta  - Matched pair IDs (worker x firm x year_prior)
*   $data/worker_matched.dta       - Full event-study panel of matched workers
********************************************************************************

//------------------------------------------------------------------------------
// STEP 1: PROPENSITY SCORE MATCHING WITHIN EACH YEAR (worker_eligible_YYYY.dta)
//------------------------------------------------------------------------------
* Matching-year window used by STEP 1 and STEP 3.
global start_year = 2005
global end_year = 2016

forvalues y = $start_year/$end_year {

	* Load the worker file for matching year y. -clear- is the documented option
	* of -use- for discarding the data in memory (STEP 4 already uses it).
	use $data/worker_`y', clear

	* The matching year is stored as year_prior; it is also a join key below.
	gen year_prior = year

	//------------------------------------------------------------------
	// Restrict to workers at matched firms
	// joinby keeps only worker rows whose (entid_syn, year_prior) appears in
	// firm_matched_list (unmatched rows are dropped by default) and brings
	// in the firm-level variables used below (treated, pairid, pscore_og).
	//------------------------------------------------------------------
	joinby entid_syn year_prior using $data/firm_matched_list

	* pscore_og here is the firm-level score; drop it so the worker-level
	* score can reuse the name further down.
	drop pscore_og

	//------------------------------------------------------------------
	// Earnings restriction: real total earnings >= ~CAD 3,900
	//------------------------------------------------------------------
	gegen 	total_wage 		= sum(t4earn), by(casenum2019)
	gen		real_total_wage = total_wage/CPI_base_2011
	drop if real_total_wage < 3900 | mi(real_total_wage)

	//------------------------------------------------------------------
	// Worker eligibility restrictions
	//------------------------------------------------------------------
	* Drop moonlighters (workers with simultaneous jobs in any year)
	drop if moonlighter == 1

	* Drop workers with gaps in their employment history (max_gap > 1)
	drop 	if max_gap > 1 | mi(max_gap)

	* Drop workers who leave their current firm before t = 0
	* (i.e., not present at the firm in the event year y + 1)
	gen 	present_at_treated = (`y' + 1 <= last_year_at_firm)
	drop 	if present_at_treated == 0

	* Require at least 4 years of tenure (continuous attachment to the firm)
	* NOTE(review): a missing last_year_at_firm or a missing tenure passes these
	* two filters, because Stata ranks missing above every number. Confirm that
	* neither variable can be missing at this point.
	drop 	if tenure < 4

	* Drop workers with missing demographic or sector information
	drop if mi(t1_age_recorded) | mi(t1_sex_recorded) | mi(naics2) | mi(OPAddressProvince)

	//------------------------------------------------------------------
	// Age bins: 5-year groups starting from age 17
	// (17-21 -> bin 0, 22-26 -> bin 1, etc.)
	//------------------------------------------------------------------
	gen bin_age = int(( t1_age_recorded - 17)/5)

	* Cell = sector x province x sex x age bin
	* Workers can only be matched to peers within the same cell
	* NOTE(review): bin_* expands to every variable whose name starts with bin_,
	* not only bin_age. Confirm no other bin_ variable arrives via worker_YYYY
	* or firm_matched_list, otherwise the cells are finer than documented.
	gegen cell = group(naics2 OPAddressProvince t1_sex_recorded bin_*)
	drop if cell == .

	//------------------------------------------------------------------
	// Estimate propensity score (linear probability model in age)
	// NOTE(review): c.x#c.x enters only the squared age term; a full quadratic
	// (age and age squared) would be c.x##c.x. Left unchanged because switching
	// would alter the matched sample - confirm which was intended.
	//------------------------------------------------------------------
	reg treated c.t1_age_recorded#c.t1_age_recorded

	* Record R2 statistics; the two lists grow by one entry per matching year
	* and are read again in STEP 2.
	local r2alist 	"`r2alist' `e(r2_a)'"
	local r2list	"`r2list' `e(r2)'"
	dis `r2alist'
	dis `r2list'

	predict pscore

	* Keep the raw fitted value before the cell offset is added
	gen pscore_og = pscore

	* Enforce within-cell matching by adding a large cell offset: consecutive
	* cells are 10 apart, the caliper below is 1, so two workers from different
	* cells can only fall within the caliper if their raw scores differ by 9+.
	* NOTE(review): pscore is stored with the default precision of predict, so
	* a large cell number leaves fewer digits for the raw score. Consider
	* predict double if within-cell ties look suspicious.
	replace pscore = pscore + 10.0*cell

	* psmatch2 is given an outcome variable, but no later step reads it, so a
	* random placeholder is used. No seed is set; the placeholder therefore
	* differs across runs.
	gen outcome = rnormal()

	* Sort for replication: fixes the row order passed to psmatch2.
	* NOTE(review): the order among rows sharing a casenum2019 is not pinned
	* down by this sort; verify casenum2019 is unique here.
	gsort casenum2019

	* 1:1 nearest-neighbor PSM without replacement, caliper 1
	psmatch2 treated, outcome(outcome) pscore(pscore) caliper(1) n(1) noreplacement
	sum outcome

	* Build a common pair key from the psmatch2 output: a matched control keeps
	* its own _id, a matched treated worker takes _n1 (per the psmatch2 help,
	* the _id of its nearest neighbor). Rows without _weight == 1 stay missing
	* and are removed in STEP 3.
	gen 	id = _id if treated==0 & _weight==1
	replace id = _n1 if treated==1 & _weight==1

	* Replace the firm-level pairid from firm_matched_list with a worker-level
	* pair number. It restarts in every matching year, so it identifies a pair
	* only together with year_prior.
	drop pairid
	gegen pairid = group(id)
	gsort pairid

	save $data/worker_eligible_`y', replace
}


//------------------------------------------------------------------------------
// STEP 2: SUMMARIZE PSM MODEL FIT
//------------------------------------------------------------------------------
* Build a small one-row-per-matching-year dataset from the R2 lists collected
* in STEP 1, then report the across-year averages.
clear
local n_adj : word count `r2alist'
set obs `n_adj'

* Adjusted R2: the k-th entry of the list goes into row k
gen adjR2 = .
forvalues k = 1/`n_adj' {
	local val : word `k' of `r2alist'
	replace adjR2 = `val' in `k'
}

* Plain R2: same layout, read from its own list
gen R2 = .
local n_raw : word count `r2list'
forvalues k = 1/`n_raw' {
	local val : word `k' of `r2list'
	replace R2 = `val' in `k'
}

* Means across matching years, stored as scalars for later use
summarize R2
scalar avg_R2 = r(mean)
summarize adjR2
scalar avg_adjR2 = r(mean)

display "Average R2 across matching years = " avg_R2
display "Average adjusted R2 across matching years = " avg_adjR2


//------------------------------------------------------------------------------
// STEP 3: COMPILE MATCHED WORKER LIST (worker_matched_list.dta)
//------------------------------------------------------------------------------
* Pass 1: reduce each yearly eligible file to its matched workers and the
* identifying columns, and write it out as worker_matched_list_YYYY.
forvalues y = $start_year/$end_year {
	* -clear- is the documented option of -use- for discarding data in memory
	use $data/worker_eligible_`y', clear

	* Keep only successfully matched workers (id is missing for everyone
	* who did not end up in a pair in STEP 1)
	drop if mi(id)
	keep casenum2019 entid_syn treated pairid year_prior pscore_og

	save $data/worker_matched_list_`y', replace
}

* Pass 2: stack the yearly lists. pairid restarts in every year, so a pair is
* identified by pairid together with year_prior in the stacked file.
drop _all
forvalues y = $start_year/$end_year {
	display "`y'"
	append using $data/worker_matched_list_`y', keep(casenum2019 entid_syn treated pairid year_prior pscore_og) force
}

compress
save $data/worker_matched_list, replace


//------------------------------------------------------------------------------
// STEP 4: BUILD THE FULL MATCHED WORKER PANEL (2001-2017)
//
// For each year, merge worker records (worker_YYYY) with the matched list.
// For each matched worker, we track their dominant firm (the M&A firm or its match).
// If a treated worker left their M&A firm, we still track where they work.
//------------------------------------------------------------------------------
* For every calendar year of the panel, attach the matched-pair information to
* the worker records and reduce the result to one row per
* (casenum2019, year_prior, pairid).
forvalues y = 2001/2017 {

	use $data/worker_`y', clear

	* Rename before joinby to preserve the original employer ID:
	* temp_id is the employer in calendar year y, while the entid_syn that
	* joinby brings in is the firm recorded in the matched list.
	rename entid_syn temp_id

	* Link workers to their matched pair (some workers appear in multiple pairs
	* if they were matched in different years). unmatched(none) drops every
	* worker record that is not in the matched list.
	joinby casenum2019 	using $data/worker_matched_list, unmatched(none)

	* Merge on the matched-list firm ID against the M&A firm file.
	* NOTE(review): keepusing() lists only the merge key, keep(1 3) retains all
	* master rows and nogen discards _merge, so this step adds no variable and
	* drops no row. It does require entid_syn to be unique in first_mna (m:1).
	* Confirm whether a variable from first_mna was meant to be kept here.
	merge m:1 entid_syn using $data/first_mna, keep(1 3) keepusing(entid_syn) nogen

	* Flag: is the worker's employer in year y the firm they were matched at?
	* Afterwards entid_syn is restored to mean the employer in year y.
	gen 	same_firm = (temp_id == entid_syn)
	drop 	entid_syn
	rename	temp_id entid_syn

	* Total wages across all jobs for the worker-pair-year observation
	gegen total_wage = sum(t4earn), by(casenum2019 pairid year_prior)

	* Deduplicate: keep the observation where the worker is at the matched firm
	* (or their highest-paying job if no longer there). duplicates drop keeps
	* the first row of each group in the current order, so the gsort just above
	* it decides which job survives; entid_syn is the final tie-breaker.
	gsort casenum2019 year_prior -same_firm -t4earn entid_syn
	duplicates drop casenum2019 year_prior pairid, force

	drop same_firm

	compress
	save $data/worker_matched_`y', replace
}

* Stack the yearly files into one panel and delete each yearly file as soon as
* it has been appended.
drop _all
forvalues y = 2001/2017 {
	append using $data/worker_matched_`y', force
	erase $data/worker_matched_`y'.dta
}

* Intermediate panel; STEP 5 continues with the data still in memory.
save $data/worker_matched_intermid, replace


//------------------------------------------------------------------------------
// STEP 5: CONSTRUCT THE EVENT-STUDY PANEL (worker_matched.dta)
//
// Adds all variables needed for worker event-study regressions:
//   - Numeric IDs (worker, firm, pair)
//   - Event time variable t and ds_k dummies
//   - Sector codes (NAICS2)
//   - M&A deal characteristics (acquirer flag, deal type)
//   - Job mobility variables (moved, moved_sec, moved_ind)
//   - Separation reason (fired vs. quit)
//   - Pair-level variables (matched_acq, matched_emoved)
//   - Present-at-firm indicator (for stayer analysis)
//------------------------------------------------------------------------------
gsort pairid year_prior -treated year

* Numeric IDs
gegen worker_id	= group(casenum2019)
gegen firm_id 	= group(entid_syn)
gegen id 		= group(worker_id year_prior)   // unique worker x matching-year ID

//------------------------------------------------------------------
// Event time variable and indicator dummies (same structure as matching_firm.do)
// t = 0 is the year after the matching year (year_prior + 1). Values beyond
// +/-5 are pooled into the end bins +6 and -6.
//------------------------------------------------------------------
gen 	t = year - (year_prior + 1)
replace t = 6 	if t >  5 	& ~mi(t)
replace t = -6 	if t < -5 	& ~mi(t)

* tab creates ds_1, ds_2, ... in ascending order of the values of t
tab t, gen(ds_)

* Label each dummy with the event time it actually stands for, and remember
* which dummy belongs to t = -1. Reading the levels instead of assuming a fixed
* offset keeps labels and base period right even if some value of t is absent;
* with all 13 values present this gives ds_1 = -6, ..., ds_6 = -1, ..., ds_13 = 6.
levelsof t, local(ts)
local i = 0
local base ""
foreach lev of local ts {
	local ++i
	label variable ds_`i' "`lev'"
	if `lev' == -1 local base `i'
}

* Without a t = -1 dummy there is no base period to omit; stop rather than
* silently zeroing some other dummy.
if "`base'" == "" {
	display as error "no observations at t = -1; cannot define the base period"
	exit 459
}
replace ds_`base' = 0   // t = -1 is the omitted base period

//------------------------------------------------------------------
// Sector codes
// naics2 is rebuilt from naics and a few codes are pooled:
// 32, 33 -> 31; 45 -> 44; 49 -> 48; 56, 61, 62 -> 54.
// NOTE(review): int(naics/100) is a 2-digit code only if naics has 4 digits;
// confirm the width of naics in the worker files.
//------------------------------------------------------------------
destring naics, replace
drop naics2
gen 	naics2 = int(naics/100)
replace naics2 = 31 if inlist(naics2, 32, 33)
replace naics2 = 44 if naics2 == 45
replace naics2 = 48 if naics2 == 49
replace naics2 = 54 if inlist(naics2, 56, 61, 62)

* Snapshot of sector and firm in the matching year (year == year_prior, i.e.
* t = -1). naics2_event is filled for treated workers only.
gen naics2_event 	= naics2  if year == year_prior & treated == 1
gen naics_event		= naics   if year == year_prior
gen firm_event 		= firm_id if year == year_prior

* Spread the matching-year values over all rows of the same worker x
* matching-year id, then overwrite naics with the matching-year code on rows
* where the worker is at that same firm, up to and including year_prior + 1.
* NOTE(review): naics2 was computed above, before this overwrite, so it is not
* refreshed from the updated naics - confirm this is intended.
gegen naics_tmp 	= firstnm(naics_event), by(id)
gegen firmid_tmp 	= firstnm(firm_event),  by(id)
replace naics 		= naics_tmp if year <= year_prior + 1 & firm_id == firmid_tmp

//------------------------------------------------------------------
// M&A deal characteristics
// Pulled from first_mna by the worker's employer in each year. For treated
// workers they are kept only in the matching year (t = -1) and blanked on
// every other row.
// NOTE(review): rows of control workers are not blanked by the condition
// below; confirm that is intended.
//------------------------------------------------------------------
merge	m:1 entid_syn 	using 	$data/first_mna, keep(1 3) keepusing(DEAL Acquirer other_party_id merger) nogen

* Numeric deal variables are set to missing ...
foreach dealvar of varlist DEAL Acquirer merger {
	replace `dealvar' = . if treated == 1 & t ~= -1
}

* ... and the string counterparty ID to the empty string
replace other_party_id = "" if treated == 1 & t ~= -1

//------------------------------------------------------------------
// Job mobility variables (computed from the matched worker panel)
// Everything below compares a row with the previous row of the same
// worker x matching-year id, so the data are ordered by id and year first.
//------------------------------------------------------------------
gsort id year

* moved == 1 when the firm differs from the previous row or at least 2 years
* have passed since it; the first row of each id has no predecessor and is 0
by id: 	gen year_diff	= year - year[_n-1]
by id: 	gen moved 		= ~mi(year_diff) & (year_diff >= 2 | firm_id ~= firm_id[_n-1])

* Post-event (t > 0) summaries per id: number of moves and an any-move flag
gegen	total_moves		= total(moved * (t > 0) ), 			by(id)
gegen	ever_moved_post	= max(moved * (t > 0)), 			by(id)

* Year and event time of the first post-event move.
* NOTE(review): because of the if qualifier the result is presumably filled
* only on rows that are themselves post-event moves (egen-style semantics),
* not on every row of the id - confirm before using it as a worker constant.
gegen	first_move_year	= min(year) if moved == 1 & t > 0, 	by(id)
gen		t_moved			= first_move_year - (year_prior + 1)

* Sector change: a move that also changes the pooled 2-digit code (0 otherwise)
by id:	gen moved_sec = (moved == 1) & (naics2 ~= naics2[_n-1])

* Industry change: a move that also changes the detailed naics code (0 otherwise)
by id:	gen moved_ind = (moved == 1) & (naics ~= naics[_n-1])

//------------------------------------------------------------------
// Separation reason (fired vs. quit)
// Based on the ROE "reason for separation" code:
//   1 = shortage of work, 7 = leave of absence, 11 = other
// fired == 1 means employer-initiated separation, 0 any other recorded
// reason, missing when no reason is recorded.
//
// fired is kept only on the row directly before a worker's first post-event
// move, and only for treated workers. The lookups read the next row of the
// same id, so they rely on the data still being sorted by id year (done at
// the top of the job mobility section).
//------------------------------------------------------------------
gen 	fired	=	inlist(reason, 1, 7, 11) if ~mi(reason)

* The last row of an id has no following row, so no move can follow it. The
* explicit _n == _N guard covers that case inside each id, including the very
* last row of the dataset, where a comparison of two out-of-range values would
* otherwise evaluate to "equal" and leave fired untouched.
by id:	replace fired	=	. if _n == _N | moved[_n+1] == 0                        // only meaningful when a move follows
by id:	replace fired	=	. if _n == _N | first_move_year[_n+1] ~= year[_n+1]    // only at the year of the first post-event move
replace fired	=	. if treated == 0                                               // only for treated workers

//------------------------------------------------------------------
// Pair-level variables
// Each variable takes the first non-missing value found within a pair and
// copies it to every row of that pair. Pairs are identified by pairid together
// with year_prior. The sort puts the treated worker's rows first within each
// pair, ordered by year.
// NOTE(review): if the treated worker's rows are all missing, the value comes
// from the control worker's rows instead - confirm that fallback is acceptable.
//------------------------------------------------------------------
gsort pairid year_prior -treated year
gegen matched_acq           = firstnm(Acquirer), 			by(pairid year_prior)   // acquirer pair
gegen matched_emoved		= firstnm(ever_moved_post), 	by(pairid year_prior)   // mobile pair

//------------------------------------------------------------------
// Present-at-firm indicator (defines the "stayer" subsample)
// The original firm is the worker's firm in the matching year
// (year == year_prior). present_at_firm is 1 on rows where the worker is at
// that firm and either t <= 0 or the row is not a move; it is missing
// (never 0) everywhere else.
//------------------------------------------------------------------
* Firm in the matching year, copied to every row of the id, then kept only on
* the rows where the worker is actually at that firm
gen 	original_firm_tmp	= firm_id if year == year_prior
gegen 	original_firm 		= firstnm(original_firm_tmp), by(id)
replace original_firm 		= . if original_firm ~= firm_id

* One condition covers both cases: before or at the event year being at the
* original firm is enough; afterwards the row must also not be a move
gen 	present_at_firm	= 1 if original_firm == firm_id & (t <= 0 | moved == 0)

compress
save $data/worker_matched, replace
